NYC Rat Sightings

Data Description

The NYC Rat Sightings dataset describes mostly the location where there are rat sightings, the dates when it happened and which type of building it happened on. In the original dataset there are also many empty columns or columns with only one value, which just means the data was organised poorly, and we will not use these.

We will analyse if sightings depend on the month they happen. Additionally, we will analyse a dataset that describes borough populations so we can analyse if population density compares to number of rat sightings and which boroughs have more density of rat sightings.

The dataset we will be using can be found on kaggle.

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.1
## ✔ readr   2.1.2     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

Data cleaning

First, we create a function to use for splitting up the different dates.

date.split <- function(txtdate, x){
  date <- parse_date(txtdate, format = "%m/%d/%Y %I:%M:%S %p")
  time <- parse_time(txtdate, format = "%m/%d/%Y %I:%M:%S %p")
  year <- as.factor(format(date, format = "%Y"))
  month <- as.factor(format(date, format = "%m"))
  day <- as.numeric(format(date, format = "%d"))
  
  if (x == 1) {value <- year}
  else if (x == 2) {value <- month}
  else if (x == 3) {value <- day}
  else if (x == 4) {value <- time}
  
  return(value)
}

Secondly, we create a function to use when splitting a specific number from a data point, as used for both the address and the community boards.

#Second attempt at a function that could split "Incident_Address" and "Community Board":
get.number <- function(address){
  if(!identical(address, NA)){
    nums <- str_extract(address, "[0-9]+")
    return(nums)
  }
}

The dataset initially looks like this:

rat_dataset <-
  read_csv("Rat_Sightings.csv", show_col_types = FALSE)
## Warning: One or more parsing issues, see `problems()` for details
head(rat_dataset)
## # A tibble: 6 × 52
##   Uniqu…¹ Creat…² Close…³ Agency Agenc…⁴ Compl…⁵ Descr…⁶ Locat…⁷ Incid…⁸ Incid…⁹
##     <dbl> <chr>   <chr>   <chr>  <chr>   <chr>   <chr>   <chr>     <dbl> <chr>  
## 1  3.15e7 09/04/… 09/18/… DOHMH  Depart… Rodent  Rat Si… 3+ Fam…   10006 <NA>   
## 2  3.15e7 09/04/… 10/28/… DOHMH  Depart… Rodent  Rat Si… Commer…   10306 2270 H…
## 3  3.15e7 09/04/… <NA>    DOHMH  Depart… Rodent  Rat Si… 1-2 Fa…   10310 758 PO…
## 4  3.15e7 09/04/… 09/14/… DOHMH  Depart… Rodent  Rat Si… 3+ Fam…   11206 198 SC…
## 5  3.15e7 09/04/… 09/22/… DOHMH  Depart… Rodent  Rat Si… 3+ Fam…   10462 2138 W…
## 6  3.15e7 09/04/… 09/22/… DOHMH  Depart… Rodent  Rat Si… 3+ Fam…   11231 179 LU…
## # … with 42 more variables: `Street Name` <chr>, `Cross Street 1` <chr>,
## #   `Cross Street 2` <chr>, `Intersection Street 1` <chr>,
## #   `Intersection Street 2` <chr>, `Address Type` <chr>, City <chr>,
## #   Landmark <lgl>, `Facility Type` <chr>, Status <chr>, `Due Date` <chr>,
## #   `Resolution Action Updated Date` <chr>, `Community Board` <chr>,
## #   Borough <chr>, `X Coordinate (State Plane)` <dbl>,
## #   `Y Coordinate (State Plane)` <dbl>, `Park Facility Name` <chr>, …

with many empty features or with one single value. In this next step, we will be taking these features out and separating some interesting data in more features:

rat_dataset <-
  read_csv("Rat_Sightings.csv", show_col_types = FALSE) %>% 
  #select("Unique Key", "Created Date", "Closed Date", "Location Type", "Incident Zip", "Incident Address", "Street Name", "Cross Street 1", "Cross Street 2", "Intersection Street 1", "Intersection Street 2", "Address Type", "Status", "Due Date", "Resolution Action Updated Date", "Community Board", "Borough", "Latitude", "Longitude") %>% 
  rename(Unique_Key = "Unique Key", Created_Date = "Created Date", Closed_Date = "Closed Date", Location_Type = "Location Type", Incident_Zip = "Incident Zip", Incident_Address = "Incident Address", Street_Name = "Street Name", Cross_Street_1 = "Cross Street 1", Cross_Street_2 = "Cross Street 2", Intersection_Street_1 = "Intersection Street 1", Intersection_Street_2 = "Intersection Street 2", Address_Type = "Address Type", Status = "Status", Due_Date = "Due Date", RAU_Date = "Resolution Action Updated Date", Community_Board = "Community Board", Borough = "Borough", Latitude = "Latitude", Longitude = "Longitude") %>% 
  mutate(Created_Year = date.split(Created_Date, 1), Created_Month = date.split(Created_Date, 2), Created_Day = date.split(Created_Date, 3)) %>% 
  mutate(Closed_Year = date.split(Closed_Date, 1), Closed_Month = date.split(Closed_Date, 2), Closed_Day = date.split(Closed_Date, 3)) %>%
  mutate(Due_Year = date.split(Due_Date, 1), Due_Month = date.split(Due_Date, 2), Due_Day = date.split(Due_Date, 3), Due_Time = date.split(Due_Date, 4)) %>%
  mutate(RAU_Year = date.split(RAU_Date, 1), RAU_Month = date.split(RAU_Date, 2), RAU_Day = date.split(RAU_Date, 3), RAU_Time = date.split(RAU_Date, 4)) %>% 
  mutate(Address_Number = get.number(Incident_Address)) %>% 
  mutate(Community_Board_Number = get.number(Community_Board)) %>% 
  select(Unique_Key, Created_Year, Created_Month, Created_Day, Closed_Year, Closed_Month, Closed_Day, Location_Type, Incident_Zip, Address_Number, Street_Name, Cross_Street_1, Cross_Street_2, Intersection_Street_1, Intersection_Street_2, Address_Type, Status, Due_Year, Due_Month, Due_Day, Due_Time, RAU_Year, RAU_Month, RAU_Day, RAU_Time, Community_Board_Number, Borough, Latitude, Longitude)
## Warning: One or more parsing issues, see `problems()` for details

We also want to make sure that all of the features have the proper value type.

rat_dataset$Location_Type <- 
  as.factor(rat_dataset$Location_Type)
rat_dataset$Street_Name <-
  as.factor(rat_dataset$Street_Name)
rat_dataset$Cross_Street_1 <-
  as.factor(rat_dataset$Cross_Street_1)
rat_dataset$Cross_Street_2 <-
  as.factor(rat_dataset$Cross_Street_2)
rat_dataset$Address_Type <-
  as.factor(rat_dataset$Address_Type)
rat_dataset$Status <-
  as.factor(rat_dataset$Status)
rat_dataset$Borough <-
  as.factor(rat_dataset$Borough)

So, our starting point data is:

head(rat_dataset)
## # A tibble: 6 × 29
##   Unique_Key Created_Y…¹ Creat…² Creat…³ Close…⁴ Close…⁵ Close…⁶ Locat…⁷ Incid…⁸
##        <dbl> <fct>       <fct>     <dbl> <fct>   <fct>     <dbl> <fct>     <dbl>
## 1   31464015 2015        09            4 2015    09           18 3+ Fam…   10006
## 2   31464024 2015        09            4 2015    10           28 Commer…   10306
## 3   31464025 2015        09            4 <NA>    <NA>         NA 1-2 Fa…   10310
## 4   31464026 2015        09            4 2015    09           14 3+ Fam…   11206
## 5   31464027 2015        09            4 2015    09           22 3+ Fam…   10462
## 6   31464188 2015        09            4 2015    09           22 3+ Fam…   11231
## # … with 20 more variables: Address_Number <chr>, Street_Name <fct>,
## #   Cross_Street_1 <fct>, Cross_Street_2 <fct>, Intersection_Street_1 <chr>,
## #   Intersection_Street_2 <chr>, Address_Type <fct>, Status <fct>,
## #   Due_Year <fct>, Due_Month <fct>, Due_Day <dbl>, Due_Time <time>,
## #   RAU_Year <fct>, RAU_Month <fct>, RAU_Day <dbl>, RAU_Time <time>,
## #   Community_Board_Number <chr>, Borough <fct>, Latitude <dbl>,
## #   Longitude <dbl>, and abbreviated variable names ¹​Created_Year, …
summary(rat_dataset)
##    Unique_Key        Created_Year   Created_Month    Created_Day   
##  Min.   :11464394   2016   :17230   07     :11982   Min.   : 1.00  
##  1st Qu.:23414521   2015   :15272   08     :11656   1st Qu.: 8.00  
##  Median :28836804   2017   :14425   06     :11307   Median :16.00  
##  Mean   :28158637   2014   :12617   05     :10779   Mean   :15.69  
##  3rd Qu.:33460138   2013   :10739   09     : 9669   3rd Qu.:23.00  
##  Max.   :37197000   2012   :10643   04     : 8897   Max.   :31.00  
##                     (Other):20988   (Other):37624                  
##   Closed_Year     Closed_Month     Closed_Day   
##  2016   :15339   08     :10479   Min.   : 1.00  
##  2015   :13672   07     :10079   1st Qu.: 8.00  
##  2017   :12980   06     : 9595   Median :16.00  
##  2014   :12173   09     : 9278   Mean   :15.86  
##  2011   :10685   05     : 8305   3rd Qu.:23.00  
##  (Other):26134   (Other):43247   Max.   :31.00  
##  NA's   :10931   NA's   :10931   NA's   :10931  
##                       Location_Type    Incident_Zip    Address_Number    
##  3+ Family Apt. Building     :41061   Min.   :    83   Length:101914     
##  1-2 Family Dwelling         :19702   1st Qu.: 10086   Class :character  
##  Other (Explain Below)       :15044   Median : 10472   Mode  :character  
##  3+ Family Mixed Use Building: 7991   Mean   : 10729                     
##  Commercial Building         : 5007   3rd Qu.: 11222                     
##  (Other)                     :13103   Max.   :100354                     
##  NA's                        :    6   NA's   :336                        
##            Street_Name             Cross_Street_1   Cross_Street_2 
##  GRAND CONCOURSE :  713   AMSTERDAM AVENUE: 1617   BROADWAY: 2053  
##  BROADWAY        :  705   BROADWAY        : 1494   BEND    : 1456  
##  EASTERN PARKWAY :  632   BEND            : 1312   1 AVENUE: 1082  
##  LAFAYETTE AVENUE:  379   2 AVENUE        : 1128   7 AVENUE:  943  
##  GREENE AVENUE   :  366   3 AVENUE        : 1113   8 AVENUE:  914  
##  (Other)         :90044   (Other)         :78594   (Other) :78777  
##  NA's            : 9075   NA's            :16656   NA's    :16689  
##  Intersection_Street_1 Intersection_Street_2       Address_Type  
##  Length:101914         Length:101914         ADDRESS     :78090  
##  Class :character      Class :character      BLOCKFACE   : 4385  
##  Mode  :character      Mode  :character      INTERSECTION: 8777  
##                                              LATLONG     :10281  
##                                              PLACENAME   :   35  
##                                              NA's        :  346  
##                                                                  
##       Status         Due_Year       Due_Month        Due_Day     
##  Assigned: 4642   2016   :17243   08     :11929   Min.   : 1.00  
##  Closed  :74025   2017   :15343   07     :11691   1st Qu.: 8.00  
##  Draft   :    1   2015   :15012   09     :11263   Median :16.00  
##  Open    :    1   2014   :12391   06     :10472   Mean   :15.67  
##  Pending :23245   2013   :10858   10     : 9932   3rd Qu.:23.00  
##                   (Other):30950   (Other):46510   Max.   :31.00  
##                   NA's   :  117   NA's   :  117   NA's   :117    
##    Due_Time           RAU_Year       RAU_Month        RAU_Day     
##  Length:101914     2016   :17375   07     :12227   Min.   : 1.00  
##  Class1:hms        2015   :15361   08     :11537   1st Qu.: 8.00  
##  Class2:difftime   2017   :14505   06     :11340   Median :16.00  
##  Mode  :numeric    2014   :12557   09     :10657   Mean   :15.82  
##                    2011   :11160   05     : 9425   3rd Qu.:23.00  
##                    (Other):30953   (Other):46725   Max.   :31.00  
##                    NA's   :    3   NA's   :    3   NA's   :3      
##    RAU_Time        Community_Board_Number          Borough         Latitude    
##  Length:101914     Length:101914          BRONX        :20706   Min.   :40.50  
##  Class1:hms        Class :character       BROOKLYN     :34673   1st Qu.:40.68  
##  Class2:difftime   Mode  :character       MANHATTAN    :26803   Median :40.73  
##  Mode  :numeric                           QUEENS       :14811   Mean   :40.74  
##                                           STATEN ISLAND: 4920   3rd Qu.:40.82  
##                                           Unspecified  :    1   Max.   :40.91  
##                                                                 NA's   :706    
##    Longitude     
##  Min.   :-74.25  
##  1st Qu.:-73.97  
##  Median :-73.94  
##  Mean   :-73.93  
##  3rd Qu.:-73.90  
##  Max.   :-73.70  
##  NA's   :706

Where are the sightings more concentrated?

To answer this question, we will import an extension of the ggplot2 library, the ggmap library. If you don’t have the ggmap package installed, please do so by running install.packages("ggmap") on your R console.

library("ggmap")
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
KEY = "AIzaSyDZb4hXrsdgQ7zjHoT1h6A26G7YbCV2jBc"
register_google(key = KEY)

Firstly, we take a look at how the sightings are distributed:

ggplot(rat_dataset, mapping = aes(y = Longitude)) +
  geom_boxplot() +
  theme_minimal()
## Warning: Removed 706 rows containing non-finite values (stat_boxplot).

ggplot(rat_dataset, mapping = aes(y = Latitude)) +
  geom_boxplot() +
  theme_minimal()
## Warning: Removed 706 rows containing non-finite values (stat_boxplot).

There is a concentration in the middle of the range, with many of what geom_boxplot() considers outliers. So the part of New York we will be looking at is:

data4lat <-
  rat_dataset %>% 
  select(Latitude) %>% 
  filter(!is.na(Latitude))

max_lat <- max(data4lat)
min_lat <- min(data4lat)

data4lon <-
  rat_dataset %>% 
  select(Longitude) %>% 
  filter(!is.na(Longitude))

max_lon <- max(data4lon)
min_lon <- min(data4lon)

newyork.map <- get_map(location= 'New York')
## Source : https://maps.googleapis.com/maps/api/staticmap?center=New%20York&zoom=10&size=640x640&scale=2&maptype=terrain&language=en-EN&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=New+York&key=xxx
sightings_map <-
  ggmap(newyork.map) +
  ylim(min_lat, max_lat) +
  xlim(min_lon, max_lon)
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
sightings_map
## Warning: Removed 1 rows containing missing values (geom_rect).

From the distribution on the boxplots, we can assume that most of the sightings are concentrated in the middle of the map, which we can confirm by plotting the sightings. There is also a smaller concentration at the top middle of the map.

sightings_map <-
  ggmap(newyork.map) +
  geom_point(data = rat_dataset, mapping = aes(x = Longitude, y = Latitude), alpha = 0.1) +
  stat_density2d(data = rat_dataset, mapping = aes(x = Longitude, y = Latitude)) +
  ylim(min_lat, max_lat) +
  xlim(min_lon, max_lon)
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
sightings_map
## Warning: Removed 706 rows containing non-finite values (stat_density2d).
## Warning: Removed 1 rows containing missing values (geom_rect).
## Warning: Removed 706 rows containing missing values (geom_point).

It seems like the sightings are mostly concentrated in 3 boroughs: Manhatten, Bronx and Brooklyn, with Staten Island having the least concentration by far.

We can look at the numbers and see which Borough actually has more sightings (as that is a variable in our dataset too).

borough_population <-
  rat_dataset %>%
  ggplot(aes(x = Borough, fill = Borough)) +
  geom_bar() +
  geom_text(stat='count', aes(label=..count..), vjust=-1) +
  scale_fill_manual(values=c("#882E72", "#B178A6", "#114477", "#1965B0", 
                             "#5289C7", "#7BAFDE", "#4EB265", "#90C987", 
                             "#CAE0AB", "#F7EE55", "#F6C141", "#F1932D", 
                             "#E8601C", "#DC050C", "#D6C1DE", "#4477AA", 
                             "#77AADD", "#117755", "#44AA88", "#99CCBB", 
                             "#777711")) +
  ylim(0, 40000) +
  labs(x = "Borough", y = "Total Number of Rat Sightings", 
       title = "Number of Rat Sightings in Each Borough") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))
borough_population

It looks like Brooklyn is the borough with the most rat sightings. Even though the map shows that Manhatten, for example, looks more densely populated, Manhatten is also a bigger borough, so the sightings are more evenly distributed throughout it.

How evenly exactly?

data4lat_brooklyn <-
  rat_dataset %>% 
  filter(!is.na(Latitude)) %>% 
  filter(Borough == "BROOKLYN") %>% 
  select(Latitude)

max_brooklyn_lat <- max(data4lat_brooklyn)
min_brooklyn_lat <- min(data4lat_brooklyn)

data4lon_brooklyn <-
  rat_dataset %>% 
  filter(!is.na(Longitude)) %>% 
  filter(Borough == "BROOKLYN") %>% 
  select(Longitude)

max_brooklyn_lon <- max(data4lon_brooklyn)
min_brooklyn_lon <- min(data4lon_brooklyn)

brooklyn_rats <-
  rat_dataset %>% 
  filter(Borough == "BROOKLYN")

brooklyn_map <-
  ggmap(newyork.map) +
  geom_point(data = brooklyn_rats, mapping = aes(x = Longitude, y = Latitude), color = "Red", size = 0.1) +
  ylim(min_brooklyn_lat, max_brooklyn_lat) +
  xlim(min_brooklyn_lon, max_brooklyn_lon)
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
brooklyn_map
## Warning: Removed 1 rows containing missing values (geom_rect).
## Warning: Removed 206 rows containing missing values (geom_point).

The rats seem to stand away from the beach and the parks (they probably have work to do, like us). The denser the subway lines get, the denser the rat population too, it seems.

And what type of location do these happen in, and how are they distributed on the map?

location_map <-
  ggmap(newyork.map) +
  geom_point(data = rat_dataset, mapping = aes(x = Longitude, y = Latitude, color = Location_Type), alpha = 0.3) +
  facet_wrap(~ Location_Type) +
  ylim(min_lat, max_lat) +
  xlim(min_lon, max_lon)
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
location_map
## Warning: Removed 21 rows containing missing values (geom_rect).
## Warning: Removed 706 rows containing missing values (geom_point).

That does not seem like a great visualization (as you really can’t see anything), let’s divide our facets into multiple plots so we can take a closer look at each of them.

location_subset1 <- subset(rat_dataset, Location_Type %in% c("1-2 Family Dwelling", "1-2 Family Mixed Use Building", "3+ Family Apt. Building", "3+ Family Mixed Use Building"))
location_map1 <-
  ggmap(newyork.map) +
  geom_point(data = location_subset1, mapping = aes(x = Longitude, y = Latitude, color = Location_Type), alpha = 0.3) +
  facet_wrap(~ Location_Type, nrow = 2) +
  ylim(min_lat, max_lat) +
  xlim(min_lon, max_lon)
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
location_map1
## Warning: Removed 4 rows containing missing values (geom_rect).
## Warning: Removed 195 rows containing missing values (geom_point).

location_subset2 <- subset(rat_dataset, Location_Type %in% c("Catch Basin/Sewer", "Commercial Building", "Construction Site", "Day Care/Nursery"))
location_map2 <-
  ggmap(newyork.map) +
  geom_point(data = location_subset2, mapping = aes(x = Longitude, y = Latitude, color = Location_Type), alpha = 0.3) +
  facet_wrap(~ Location_Type, nrow = 2) +
  ylim(min_lat, max_lat) +
  xlim(min_lon, max_lon)
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
location_map2
## Warning: Removed 4 rows containing missing values (geom_rect).
## Warning: Removed 90 rows containing missing values (geom_point).

location_subset3 <- subset(rat_dataset, Location_Type %in% c("Government Building", "Hospital", "Office Building", "Other (Explain Below)"))
location_map3 <-
  ggmap(newyork.map) +
  geom_point(data = location_subset3, mapping = aes(x = Longitude, y = Latitude, color = Location_Type), alpha = 0.3) +
  facet_wrap(~ Location_Type, nrow = 2) +
  ylim(min_lat, max_lat) +
  xlim(min_lon, max_lon)
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
location_map3
## Warning: Removed 4 rows containing missing values (geom_rect).
## Warning: Removed 348 rows containing missing values (geom_point).

location_subset4 <- subset(rat_dataset, Location_Type %in% c("Parking Lot/Garage", "Public Garden", "Public Stairs", "School/Pre-School"))
location_map4 <-
  ggmap(newyork.map) +
  geom_point(data = location_subset4, mapping = aes(x = Longitude, y = Latitude, color = Location_Type), alpha = 0.3) +
  facet_wrap(~ Location_Type, nrow = 2) +
  ylim(min_lat, max_lat) +
  xlim(min_lon, max_lon)
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
location_map4
## Warning: Removed 4 rows containing missing values (geom_rect).
## Warning: Removed 26 rows containing missing values (geom_point).

location_subset5 <- subset(rat_dataset, Location_Type %in% c("Single Room Occupancy (SRO)", "Summer Camp", "Vacant Building", "Vacant Lot"))
location_map5 <-
  ggmap(newyork.map) +
  geom_point(data = location_subset5, mapping = aes(x = Longitude, y = Latitude, color = Location_Type), alpha = 0.3) +
  facet_wrap(~ Location_Type, nrow = 2) +
  ylim(min_lat, max_lat) +
  xlim(min_lon, max_lon)
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
location_map5
## Warning: Removed 4 rows containing missing values (geom_rect).
## Warning: Removed 47 rows containing missing values (geom_point).

Looking at the Family Buildings, it is actually scary how evenly spread out (and still, always concentrated) around the maps. Even in Staten Island, which we already saw had the least concentration, we can still see that most of them were in Family Buildings. Looking at the map for the sightings at Sewers, it is no surprise the bigest concentration is in Manhatten, which is famously known in movies for having rats come out of sewers, as well as walking around stealing Dolar Slices, so you can see that Manhatten is also the one with more concentration in Public Spaces. It is also no surprise most of the sightings at Government Buildings are in Manhatten, as most of the Government buildings in New York are in Downtown Manhatten. There are no rat sightings in Office Buildings in Staten Island, as it is a Borough known to be mostly residential. For the “Other (Explain Below)”, it makes sense for this one to be the most densely populated (as there are many building types not listed as the primary ones) and the one with the sightings more evenly distributed (as there are “other” types of buildings everywhere).

Do the sighting locations change over the years?

years_map <-
  ggmap(newyork.map) +
  geom_point(data = rat_dataset, mapping = aes(x = Longitude, y = Latitude, color = Created_Year), alpha = 0.3) +
  facet_wrap(~ Created_Year) +
  ylim(min_lat, max_lat) +
  xlim(min_lon, max_lon)
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
years_map
## Warning: Removed 8 rows containing missing values (geom_rect).
## Warning: Removed 706 rows containing missing values (geom_point).

It definitely looks like that is not the case, as the maps all look basically exactly the same, meaning the actual locations of the sightings are constant, which may indicate to no interventions ever having been done in any of the places.

Location Type and Rat number

Check how many location types it is.

unique(rat_dataset$Location_Type)
##  [1] 3+ Family Mixed Use Building  Commercial Building          
##  [3] 1-2 Family Dwelling           3+ Family Apt. Building      
##  [5] Public Stairs                 Other (Explain Below)        
##  [7] Vacant Lot                    Construction Site            
##  [9] Hospital                      Parking Lot/Garage           
## [11] Catch Basin/Sewer             Vacant Building              
## [13] 1-2 Family Mixed Use Building Public Garden                
## [15] Government Building           Office Building              
## [17] School/Pre-School             Day Care/Nursery             
## [19] Single Room Occupancy (SRO)   Summer Camp                  
## [21] <NA>                         
## 20 Levels: 1-2 Family Dwelling ... Vacant Lot
# 20 location types

There are 20 location types.

Compute the percentage of rat sighting in each location type

loc_per <- rat_dataset %>%
  group_by(Location_Type) %>%
  summarise(Percentage = round(100*(n() / nrow(.)), 1))

loc_per
## # A tibble: 21 × 2
##    Location_Type                 Percentage
##    <fct>                              <dbl>
##  1 1-2 Family Dwelling                 19.3
##  2 1-2 Family Mixed Use Building        1.7
##  3 3+ Family Apt. Building             40.3
##  4 3+ Family Mixed Use Building         7.8
##  5 Catch Basin/Sewer                    1.1
##  6 Commercial Building                  4.9
##  7 Construction Site                    2.2
##  8 Day Care/Nursery                     0.1
##  9 Government Building                  0.3
## 10 Hospital                             0.1
## # … with 11 more rows

Plot the bar chart that reveals the occupation of rat sighting in each location type

library(ggcharts)

loc_per %>%
  bar_chart(Location_Type, Percentage,
            bar_color = c("#882E72", "#B178A6", "#114477", "#1965B0", 
                          "#5289C7", "#7BAFDE", "#4EB265", "#90C987", 
                          "#CAE0AB", "#F7EE55", "#F6C141", "#F1932D", 
                          "#E8601C", "#DC050C", "#D6C1DE", "#4477AA", 
                          "#77AADD", "#117755", "#44AA88", "#99CCBB", 
                          "#777711")) +
  geom_text(aes(label = paste(Percentage , "%")),
            hjust = -0.2) +
  ylim(NA, 45) +
  theme(plot.title = element_text(hjust=0.5)) +
  labs(x = "", y = "", title = "Percentage of Rat Sighting in Different Location Type",
       fill = "Location Type")
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.

Following the implications of the bar chart, the most frequent cases of rat sightings occurred in apartment buildings with 3 or more family members, accounting for 40.3%. This is followed by residences with 1 to 2 family members at 19.3%. Notably, mixed-use buildings with 3 or more family members also accounted for 7.8%. Overall, the types of locations associated with families held the most complaints of rat sightings.

Time and Rat number

Plot the number of rat sightings each month

rat_dataset %>%
  ggplot(aes(x = Created_Month, fill = Created_Month)) +
  geom_bar() +
  geom_text(stat='count', aes(label=..count..), vjust=-1) +
  scale_fill_manual(values=c("#882E72", "#B178A6", "#114477", "#1965B0", 
                             "#5289C7", "#7BAFDE", "#4EB265", "#90C987", 
                             "#CAE0AB", "#F7EE55", "#F6C141", "#F1932D", 
                             "#E8601C", "#DC050C", "#D6C1DE", "#4477AA", 
                             "#77AADD", "#117755", "#44AA88", "#99CCBB", 
                             "#777711")) +
  ylim(0, 15000) +
  labs(x = "Month", y = "Total Number of Rat Sighting", 
       title = "Number of Rat Sighting Each Month", fill = "Month") +
  theme_minimal() +
  theme(plot.title = element_text(hjust=0.5))

In July, the highest number of rat sightings was recorded, with 11,982 cases, followed by August and June. In general, the number of rat sightings can be identified in relation to fluctuations in temperature, as in winter months, i.e. November, December, January and February, the number of rat sightings are the lowest. With rising temperatures, however, the number of rat sightings increased markedly.

Plot the Percentage of rat sightings by month

# compute the percentage of each month on rat sighting
month_per <- rat_dataset %>%
  group_by(Created_Month) %>%
  summarise(Percentage = round(100*(n() / nrow(.)),1))

month_per
## # A tibble: 12 × 2
##    Created_Month Percentage
##    <fct>              <dbl>
##  1 01                   5.6
##  2 02                   5.8
##  3 03                   7.6
##  4 04                   8.7
##  5 05                  10.6
##  6 06                  11.1
##  7 07                  11.8
##  8 08                  11.4
##  9 09                   9.5
## 10 10                   7.8
## 11 11                   5.3
## 12 12                   4.8
month_per %>%
  mutate(Month = Created_Month,
         Month = recode_factor(Month, '01' = "January", '02' = "Febuary",
                        '03' = "March", '04' = "April", '05' = "May", '06' = "June",
                        '07' = "July", '08' = "August", '09' = "September", 
                        '10' = "October", '11' = "November", '12' = "December")) %>%
  bar_chart(Month, Percentage,
            bar_color = c("#882E72", "#B178A6", "#114477", "#1965B0", 
                          "#5289C7", "#7BAFDE", "#4EB265", "#90C987", 
                          "#CAE0AB", "#F7EE55", "#F6C141", "#F1932D")) +
  geom_text(aes(label = paste(Percentage , "%")),
            hjust = -0.2) +
  ylim(NA, 15) +
  theme(plot.title = element_text(hjust=0.5)) +
  labs(x = "", y = "Percentage", title = "Percentage of Rat Sighting in Each Month",
       fill = "Month")
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.

A monthly percentage of rat sightings clearly reveals that rat sightings can be roughly divided by season. Rat sightings are highest in the summer months, followed by spring and fall, and lowest in the winter months.

Determine whether the cases of rat sightings evolve with the year

Plot the number of rat sightings versus year

rat_dataset %>%
  ggplot(aes(x = Created_Year, fill = Created_Year)) +
  geom_bar() +
  geom_text(stat='count', aes(label=..count..), vjust=-1) +
  scale_fill_manual(values=c("#882E72", "#B178A6", "#5289C7", "#7BAFDE", 
                             "#CAE0AB", "#F7EE55", "#E8601C", "#DC050C"))  +
  ylim(0, 20000) +
  labs(x = "Year", y = "Total Number of Rat Sighting", 
       title = "Number of Rat Sighting 2010 - 2017", fill = "Year") +
  theme_classic() +
  theme(plot.title = element_text(hjust=0.5))

Since 2010 to 2013, the number of rat sighting cases did not fluctuate measurably. Yet, the number of rat sightings escalated yearly after 2013, from 2014 to 2016, with the number of rat sightings growing from 10,534 cases in 2010 to 17,230 cases in 2016. Possible reasons behind this are that New York residents tend to report rat sightings to the administration year by year or that the population of New York rises over the period of the years, thus causing the rat population to rise with it.

Another thing that needs to be mentioned here is that data collection was interrupted in September 2017, hence the drop in rat sightings in 2017.

Plot the line chart of rat sightings versus year to more easily observe changes

# Year with rat sighting (count)
year_count <- rat_dataset %>%
  group_by(Created_Year) %>%
  summarise(Number = n())

year_count
## # A tibble: 8 × 2
##   Created_Year Number
##   <fct>         <int>
## 1 2010          10534
## 2 2011          10454
## 3 2012          10643
## 4 2013          10739
## 5 2014          12617
## 6 2015          15272
## 7 2016          17230
## 8 2017          14425
# Year with rat sighting (count)
year_count %>%
  ggplot(aes(x = Created_Year, y = Number, group = 1)) +
  geom_point(size = 1.5) +
  geom_line(color = "dark blue", size = 1) +
  labs(x = "Year", y = "Number of Rat Sighting", 
       title = "Number of Rat Sighting 2010 - 2017", fill = "Year") +
  theme_classic() +
  theme(plot.title = element_text(hjust=0.5))

The line graph offers a better experience in observing the trend of rat sightings by year. The line graph reaffirms the above discussion, with an alarming increase in rat sightings from 2013 to 2016.

Mapping the number of rat sightings for each month of each year

# Number of Rat Sighting in each Months of Each Year
year_month_count <- rat_dataset %>%
  group_by(Created_Year) %>%
  count(Created_Month)
# Number of Rat Sighting in each Months of Each Year
year_month_count %>%
  ggplot(aes(x = Created_Month, y = n, color = Created_Year, group = Created_Year)) +
  geom_point() +
  geom_line() +
  scale_color_manual(values=c("#882E72", "black", "#EEA236", "#5CB85C", 
                              "#46B8DA", "#9632B8", "dark green", "#DC050C")) +
  labs(x = "Month", y = "Number of Rat Sighting", 
       title = "Number of Rat Sighting", color = "Year") +
  theme_classic() +
  theme(plot.title = element_text(hjust=0.5))

The graph reflects that the profile of the number of rat sightings is similar from year to year, with the most frequent rat sightings usually happening in the summer months as opposed to the less frequent rat sightings usually falling in the winter months. On the other hand, the number of rat sightings rose over time, in the case of 2017, even though the data are not yet complete, it remained noticeable that the overall number of rat sightings per month was much higher than in previous years.

Check how long it took the administration to solve a case of rat sighting

Compute the length of time to solve a complaint of a rat sighting

# combine the year, month, and day of started day
starttime <- as.Date(with(rat_dataset, paste(Created_Year, Created_Month, 
                                             Created_Day, sep = "-")), "%Y-%m-%d")

# combine the year, month, and day of closed day
endtime <- as.Date(with(rat_dataset, paste(Closed_Year, Closed_Month, 
                                             Closed_Day, sep = "-")), "%Y-%m-%d")

# compute for the duration taken to solve
solved_duration <- data.frame(difftime(endtime,starttime,units = "days"))
colnames(solved_duration)[1] <- "duration"
solved_duration$duration <- as.numeric(solved_duration$duration)

# merge year created year with solved time, in order to see if the government's efficient improve
solved_duration <- cbind(rat_dataset$Created_Year, solved_duration)
colnames(solved_duration)[1] <- "Sighting_Year"

# remove the negative number, as it cannot be negative time to solve the issue
solved_duration <- solved_duration %>%
  mutate(month_cat = cut(duration, breaks = c(0, 30, 60, 90, 120, 150, 180, 1000), 
                          labels = c("less than 1 month", "less than 2 month",
                               "less than 3 month", "less than 4 month",
                               "less than 5 month", "less than half year",
                               "more than half year")))

# merge it to the original dataset
rat_dataset <- cbind(rat_dataset, solved_duration)

solved_duration <-
  solved_duration %>% 
  drop_na()

Plot the time required to solve the problem

solved_duration %>%
  ggplot(aes(x = month_cat, fill = month_cat)) +
  geom_bar() +
  geom_text(stat='count', aes(label=..count..), vjust=-1) +
  scale_fill_manual(values=c("#882E72", "#5289C7", "#CAE0AB", 
                             "#E8601C", "#DC050C", "#D6C1DE", "#4477AA")) +
  ylim(0, 70000) +
  labs(x = "Time to Solve", y = "Cases Count", 
       title = "Time to Solve Rat Sighting", fill = "Month") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, 
                                   hjust = 1, vjust = 0),
        plot.title = element_text(hjust=0.5))

Apparently, the majority of rat sighting complaints were resolved within 1 month, with 64,371 cases, followed by less than 2 months, with 5,599 cases, suggesting that the administration is efficient in tackling rat sighting problems.

Does the location matter when it comes to how long they take to solve a case?

duration_map <-
  ggmap(newyork.map) +
  geom_point(data = rat_dataset %>% filter_at(vars(Longitude, Latitude, month_cat),all_vars(!is.na(.))), mapping = aes(x = Longitude, y = Latitude, color = month_cat), alpha = 0.3) +
  facet_wrap(~ month_cat) +
  ylim(min_lat, max_lat) +
  xlim(min_lon, max_lon)
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
duration_map
## Warning: Removed 7 rows containing missing values (geom_rect).

The cases that take longer than 4 months to be closed are mostly localised in Manhatten, which makes sense, because, as you will see later, Mahantten is the borough with the biggest rat/population ratio.

Plot the number of rat sighting case resolution times by year.

# plot the duration taken to solved the issue of each year
solved_duration %>%
  filter(Sighting_Year == c(2010, 2011, 2012, 2013)) %>%
  ggplot(aes(x = month_cat, fill = month_cat)) +
  geom_bar(position=position_dodge(preserve = "single"), stat='count') +
  geom_text(stat='count', aes(label=..count..), vjust=-1) +
  scale_fill_manual(values=c("#882E72", "#5289C7", "#CAE0AB", 
                             "#E8601C", "#DC050C", "#D6C1DE", "#4477AA")) +
  ylim(0, 2300) +
  labs(x = "Time to Solve", y = "Cases Count", 
       title = "Time to Solve Rat Sighting", fill = "Month") +
  facet_grid(.~Sighting_Year) +
  theme(axis.text.x = element_text(angle = 90, 
                                   hjust = 1, vjust = 0),
        plot.title = element_text(hjust=0.5))

# plot the duration taken to solved the issue of each year
solved_duration %>%
  filter(Sighting_Year == c(2014, 2015, 2016, 2017)) %>%
  ggplot(aes(x = month_cat, fill = month_cat)) +
  geom_bar(position=position_dodge(preserve = "single"), stat='count') +
  geom_text(stat='count', aes(label=..count..), vjust= -1) +
  scale_fill_manual(values = c("#882E72", "#5289C7", "#CAE0AB", 
                             "#E8601C", "#DC050C", "#D6C1DE", "#4477AA"),
                    labels = c("less than 1 month", "less than 2 month",
                               "less than 3 month", "less than 4 month",
                               "less than 5 month", "less than half year",
                               "more than half year")) +
  ylim(0, 3000) +
  labs(x = "Time to Solve", y = "Cases Count", 
       title = "Time to Solve Rat Sighting", fill = "Month") +
  facet_grid(.~Sighting_Year) +
  theme(axis.text.x = element_text(angle = 90, 
                                   hjust = 1, vjust = 0),
        plot.title = element_text(hjust=0.5))

On balance, from 2010 to 2017, an increasing number of sighting cases were solved within 1 month, with the number of cases requiring more than 2 months to be closed decreasing each year.

Plot the percentage of rat sighting case resolution times by year

library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
solved_duration %>%
  filter(Sighting_Year == c(2010, 2011, 2012), duration > 0) %>%
  ggplot(aes(x= month_cat,  group=Sighting_Year)) + 
  geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
  geom_text(aes(label = scales::percent(round((..prop..),3)),
                y= ..prop.. ), 
            stat= "count", vjust = -0.5, hjust = 0.5, angle = 0, 
            posistion = position_dodge(width=0.9),  size = 3) +
  scale_fill_manual(values = c("#882E72", "#5289C7", "#CAE0AB", 
                               "#E8601C", "#DC050C", "#D6C1DE", "#4477AA"),
                    labels = c("less than 1 month", "less than 2 month",
                               "less than 3 month", "less than 4 month",
                               "less than 5 month", "less than half year",
                               "more than half year")) +
  ylim(0, 100) +
  labs(x = "Time to Solve", y = "Cases Count", 
       title = "Time to Solve Rat Sighting", fill = "Month") +
  facet_grid(~Sighting_Year) +
  scale_y_continuous(labels=percent) +
  theme(axis.text.x = element_text(angle = 90, 
                                   hjust = 1, vjust = 0),
        plot.title = element_text(hjust=0.5))
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.

solved_duration %>%
  filter(Sighting_Year == c(2013, 2014, 2015)) %>%
  ggplot(aes(x= month_cat,  group=Sighting_Year)) + 
  geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
  geom_text(aes(label = scales::percent(round((..prop..),3)),
                y= ..prop.. ), 
            stat= "count", vjust = -0.5, hjust = 0.5, angle = 0, 
            posistion = position_dodge(width=0.9),  size = 3) +
  scale_fill_manual(values = c("#882E72", "#5289C7", "#CAE0AB", 
                               "#E8601C", "#DC050C", "#D6C1DE", "#4477AA"),
                    labels = c("less than 1 month", "less than 2 month",
                               "less than 3 month", "less than 4 month",
                               "less than 5 month", "less than half year",
                               "more than half year")) +
  ylim(0, 100) +
  labs(x = "Time to Solve", y = "Cases Count", 
       title = "Time to Solve Rat Sighting", fill = "Month") +
  facet_grid(~Sighting_Year) +
  scale_y_continuous(labels=percent) +
  theme(axis.text.x = element_text(angle = 90, 
                                   hjust = 1, vjust = 0),
        plot.title = element_text(hjust=0.5))
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.

solved_duration %>%
  filter(Sighting_Year == c(2016, 2017)) %>%
  ggplot(aes(x= month_cat,  group=Sighting_Year)) + 
  geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
  geom_text(aes(label = scales::percent(round((..prop..),3)),
                y= ..prop.. ), 
            stat= "count", vjust = -0.5, hjust = 0.5, angle = 0, 
            posistion = position_dodge(width=0.9),  size = 4) +
  scale_fill_manual(values = c("#882E72", "#5289C7", "#CAE0AB", 
                               "#E8601C", "#DC050C", "#D6C1DE", "#4477AA"),
                    labels = c("less than 1 month", "less than 2 month",
                               "less than 3 month", "less than 4 month",
                               "less than 5 month", "less than half year",
                               "more than half year")) +
  ylim(0, 100) +
  labs(x = "Time to Solve", y = "Cases Count", 
       title = "Time to Solve Rat Sighting", fill = "Month") +
  facet_grid(~Sighting_Year) +
  scale_y_continuous(labels=percent) +
  theme(axis.text.x = element_text(angle = 90, 
                                   hjust = 1, vjust = 0),
        plot.title = element_text(hjust=0.5))
## Scale for 'y' is already present. Adding another scale for 'y', which will
## replace the existing scale.

As seen in the graph, the overall efficiency of solving rat sighting cases has improved year by year, from 73.2% of cases resolved within 1 month in 2010 to approximately 80% of cases closed within 1 month in 2011, 2012, and further to 90% of cases settled within 1 month in 2013 to 2017.

Is the number of rat sightings in a given borough proportional to its population?

Quick side note: 2008 and 2009 have been ignored because they only have 18 and 1 rat sightings respectively. Also, when looking up the data; some borough have different names as counties, but exactly the same area, so one should look up the county names instead.

Borough_Rats <- 
  rat_dataset %>% 
  group_by(Borough, Due_Year) %>% 
  tally(sort = TRUE) %>% 
  pivot_wider(names_from = Due_Year, values_from = n) %>% 
  select(Borough, "2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017") %>% 
  drop_na()

head(Borough_Rats)
## # A tibble: 5 × 9
## # Groups:   Borough [5]
##   Borough       `2010` `2011` `2012` `2013` `2014` `2015` `2016` `2017`
##   <fct>          <int>  <int>  <int>  <int>  <int>  <int>  <int>  <int>
## 1 BROOKLYN        3237   3631   3574   3359   3887   5136   5999   5795
## 2 MANHATTAN       2734   2410   2812   3053   3467   4063   4648   3563
## 3 BRONX           1938   2145   2145   2110   2721   3158   3455   3026
## 4 QUEENS          1511   1582   1512   1687   1713   2065   2409   2316
## 5 STATEN ISLAND    619    550    530    649    603    590    732    643
Borough_Population <- 
  read_csv("New_York_City_Population_by_Borough__1950_-_2040.csv", show_col_types = FALSE) %>% 
  select("Borough", "Area", "2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017") %>% 
  mutate(Borough = toupper(Borough)) %>% 
  inner_join(Borough_Rats, by = "Borough") %>% 
  rename(Borough = "Borough", Area = "Area", Population_2010 = "2010.x", Rat_Sightings_2010 = "2010.y", Population_2011 = "2011.x", Rat_Sightings_2011 = "2011.y", Population_2012 = "2012.x", Rat_Sightings_2012 = "2012.y", Population_2013 = "2013.x", Rat_Sightings_2013 = "2013.y", Population_2014 = "2014.x", Rat_Sightings_2014 = "2014.y", Population_2015 = "2015.x", Rat_Sightings_2015 = "2015.y", Population_2016 = "2016.x", Rat_Sightings_2016 = "2016.y", Population_2017 = "2017.x", Rat_Sightings_2017 = "2017.y") %>%
  relocate(Borough, Area, Population_2010, Rat_Sightings_2010, Population_2011, Rat_Sightings_2011, Population_2012, Rat_Sightings_2012, Population_2013, Rat_Sightings_2013, Population_2014, Rat_Sightings_2014, Population_2015, Rat_Sightings_2015, Population_2016, Rat_Sightings_2016, Population_2017, Rat_Sightings_2017) #%>% 
## Warning: One or more parsing issues, see `problems()` for details
  #mutate(Sightings_Rate = round(((Rat_Sightings / Population) * 100000), digits = 2))
  

head(Borough_Population)
## # A tibble: 5 × 18
##   Borough   Area Popul…¹ Rat_S…² Popul…³ Rat_S…⁴ Popul…⁵ Rat_S…⁶ Popul…⁷ Rat_S…⁸
##   <chr>    <dbl>   <dbl>   <int>   <dbl>   <int>   <dbl>   <int>   <dbl>   <int>
## 1 BRONX    110   1385108    1938 1396954    2145 1411087    2145 1421498    2110
## 2 BROOKLYN 183.  2552911    3237 2540918    3631 2568538    3574 2587759    3359
## 3 MANHATT…  59.1 1585873    2734 1608717    2410 1624573    2812 1628379    3053
## 4 QUEENS   280   2250002    1511 2255261    1582 2271920    1512 2286788    1687
## 5 STATEN … 152    468730     619  471014     550  470597     530  471783     649
## # … with 8 more variables: Population_2014 <dbl>, Rat_Sightings_2014 <int>,
## #   Population_2015 <dbl>, Rat_Sightings_2015 <int>, Population_2016 <dbl>,
## #   Rat_Sightings_2016 <int>, Population_2017 <dbl>, Rat_Sightings_2017 <int>,
## #   and abbreviated variable names ¹​Population_2010, ²​Rat_Sightings_2010,
## #   ³​Population_2011, ⁴​Rat_Sightings_2011, ⁵​Population_2012,
## #   ⁶​Rat_Sightings_2012, ⁷​Population_2013, ⁸​Rat_Sightings_2013

##Creating two new databases; one for rats per population and one for rats per area

Rats_Per_Pop <-
  Borough_Population %>% 
  mutate('2010' = round(((Rat_Sightings_2010 / Population_2010) * 100000), digits = 2), '2011' = round(((Rat_Sightings_2011 / Population_2011) * 100000), digits = 2), '2012' = round(((Rat_Sightings_2012 / Population_2012) * 100000), digits = 2), '2013' = round(((Rat_Sightings_2013 / Population_2013) * 100000), digits = 2), '2014' = round(((Rat_Sightings_2014 / Population_2014) * 100000), digits = 2), '2015' = round(((Rat_Sightings_2015 / Population_2015) * 100000), digits = 2), '2016' = round(((Rat_Sightings_2016 / Population_2016) * 100000), digits = 2), '2017' = round(((Rat_Sightings_2017 / Population_2017) * 100000), digits = 2)) %>% 
  select(Borough, '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017')

head(Rats_Per_Pop)
## # A tibble: 5 × 9
##   Borough       `2010` `2011` `2012` `2013` `2014` `2015` `2016` `2017`
##   <chr>          <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
## 1 BRONX          140.   154.   152.   148.   190.   219.    239.   210.
## 2 BROOKLYN       127.   143.   139.   130.   149.   197.    230.   223.
## 3 MANHATTAN      172.   150.   173.   187.   212.   248.    284.     0 
## 4 QUEENS          67.2   70.2   66.6   73.8   74.5   89.6   104.   101.
## 5 STATEN ISLAND  132.   117.   113.   138.   128.   125.    154.   135.
Rats_Per_Km <-
  Borough_Population %>% 
  mutate('2010' = round((Rat_Sightings_2010 / Area), digits = 2), '2011' = round((Rat_Sightings_2011 / Area), digits = 2), '2012' = round((Rat_Sightings_2012 / Area), digits = 2), '2013' = round((Rat_Sightings_2013 / Area), digits = 2), '2014' = round((Rat_Sightings_2014 / Area), digits = 2), '2015' = round((Rat_Sightings_2015 / Area), digits = 2), '2016' = round((Rat_Sightings_2016 / Area), digits = 2), '2017' = round((Rat_Sightings_2017 / Area), digits = 2)) %>% 
  select(Borough, '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017')

head(Rats_Per_Km)
## # A tibble: 5 × 9
##   Borough       `2010` `2011` `2012` `2013` `2014` `2015` `2016` `2017`
##   <chr>          <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
## 1 BRONX          17.6   19.5   19.5   19.2   24.7   28.7   31.4   27.5 
## 2 BROOKLYN       17.6   19.8   19.5   18.3   21.2   28     32.7   31.6 
## 3 MANHATTAN      46.3   40.8   47.6   51.7   58.7   68.8   78.6   60.3 
## 4 QUEENS          5.4    5.65   5.4    6.03   6.12   7.38   8.6    8.27
## 5 STATEN ISLAND   4.07   3.62   3.49   4.27   3.97   3.88   4.82   4.23

##Visualization for above

P_Rates <-
  Rats_Per_Pop %>% 
  pivot_longer(c('2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017'), names_to = "Year", values_to = "Rate") 

as.factor(P_Rates$Year)
##  [1] 2010 2011 2012 2013 2014 2015 2016 2017 2010 2011 2012 2013 2014 2015 2016
## [16] 2017 2010 2011 2012 2013 2014 2015 2016 2017 2010 2011 2012 2013 2014 2015
## [31] 2016 2017 2010 2011 2012 2013 2014 2015 2016 2017
## Levels: 2010 2011 2012 2013 2014 2015 2016 2017
head(P_Rates, n = 20)
## # A tibble: 20 × 3
##    Borough   Year   Rate
##    <chr>     <chr> <dbl>
##  1 BRONX     2010   140.
##  2 BRONX     2011   154.
##  3 BRONX     2012   152.
##  4 BRONX     2013   148.
##  5 BRONX     2014   190.
##  6 BRONX     2015   219.
##  7 BRONX     2016   239.
##  8 BRONX     2017   210.
##  9 BROOKLYN  2010   127.
## 10 BROOKLYN  2011   143.
## 11 BROOKLYN  2012   139.
## 12 BROOKLYN  2013   130.
## 13 BROOKLYN  2014   149.
## 14 BROOKLYN  2015   197.
## 15 BROOKLYN  2016   230.
## 16 BROOKLYN  2017   223.
## 17 MANHATTAN 2010   172.
## 18 MANHATTAN 2011   150.
## 19 MANHATTAN 2012   173.
## 20 MANHATTAN 2013   187.
P_Rates %>% 
  ggplot(aes(x=Year, y=Rate, color = Borough, group = Borough)) +
  geom_point() +
  geom_line() +
  labs(x = "Year", y = "Sightings per 100.000 population", title = "Rate of sightings by Borough population", color = "Borough") +
  theme_classic() +
  theme(plot.title = element_text(hjust=0.5))

P_Rates %>% 
  ggplot(aes(x=Year, y=Rate, group = Borough)) +
  geom_point() +
  geom_line() +
  facet_wrap(~ Borough) + 
  labs(x = "Year", y = "Sightings per 100.000 population", title = "Rate of sightings by Borough population") +
  theme_classic() +
  theme(plot.title = element_text(hjust=0.5))

A_Rates <-
  Rats_Per_Km %>% 
  pivot_longer(c('2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017'), names_to = "Year", values_to = "Rate") 

as.factor(A_Rates$Year)
##  [1] 2010 2011 2012 2013 2014 2015 2016 2017 2010 2011 2012 2013 2014 2015 2016
## [16] 2017 2010 2011 2012 2013 2014 2015 2016 2017 2010 2011 2012 2013 2014 2015
## [31] 2016 2017 2010 2011 2012 2013 2014 2015 2016 2017
## Levels: 2010 2011 2012 2013 2014 2015 2016 2017
head(A_Rates, n = 20)
## # A tibble: 20 × 3
##    Borough   Year   Rate
##    <chr>     <chr> <dbl>
##  1 BRONX     2010   17.6
##  2 BRONX     2011   19.5
##  3 BRONX     2012   19.5
##  4 BRONX     2013   19.2
##  5 BRONX     2014   24.7
##  6 BRONX     2015   28.7
##  7 BRONX     2016   31.4
##  8 BRONX     2017   27.5
##  9 BROOKLYN  2010   17.6
## 10 BROOKLYN  2011   19.8
## 11 BROOKLYN  2012   19.5
## 12 BROOKLYN  2013   18.3
## 13 BROOKLYN  2014   21.2
## 14 BROOKLYN  2015   28  
## 15 BROOKLYN  2016   32.7
## 16 BROOKLYN  2017   31.6
## 17 MANHATTAN 2010   46.3
## 18 MANHATTAN 2011   40.8
## 19 MANHATTAN 2012   47.6
## 20 MANHATTAN 2013   51.7
A_Rates %>% 
  ggplot(aes(x=Year, y=Rate, color = Borough, group = Borough)) +
  geom_point() +
  geom_line() +
  labs(x = "Year", y = "Sightings per square kilometre", title = "Rate of sightings by Borough area", color = "Borough") +
  theme_classic() +
  theme(plot.title = element_text(hjust=0.5))

A_Rates %>% 
  ggplot(aes(x=Year, y=Rate, group = Borough)) +
  geom_point() +
  geom_line() +
  facet_wrap(~ Borough) +
  labs(x = "Year", y = "Sightings per square kilometre", title = "Rate of sightings by Borough area", color = "Borough") +
  theme_classic() +
  theme(plot.title = element_text(hjust=0.5))